{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Goal\n",
    "map cities from Indeed to cities in Advan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-15T15:42:56.314518Z",
     "iopub.status.busy": "2023-09-15T15:42:56.314225Z",
     "iopub.status.idle": "2023-09-15T15:42:56.490174Z",
     "shell.execute_reply": "2023-09-15T15:42:56.489400Z"
    }
   },
   "outputs": [],
   "source": [
    "import sys\n",
    "import pickle\n",
    "import polars as pl\n",
    "\n",
    "sys.path.append(\"../\")\n",
    "\n",
    "# CHANGE\n",
    "path_data = '' # analysis data\n",
    "path_output = '' # curated data\n",
    "\n",
    "pl.Config.set_fmt_str_lengths(100);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-15T15:42:56.494190Z",
     "iopub.status.busy": "2023-09-15T15:42:56.493909Z",
     "iopub.status.idle": "2023-09-15T15:43:00.622791Z",
     "shell.execute_reply": "2023-09-15T15:43:00.621930Z"
    }
   },
   "outputs": [],
   "source": [
    "# get Indeed dataset and clean it\n",
    "\n",
    "df_indeed = (\n",
    "    pl.read_parquet(\n",
    "        path_output + 'indeed_all_jobs.parquet', columns=['city', 'province', 'job_key'])\n",
    "    .groupby(['city', 'province'])\n",
    "    .agg(pl.count().alias('job_count'))\n",
    "    .filter(pl.col('city').is_not_null())\n",
    "    .sort('job_count', descending=True)\n",
    ")\n",
    "\n",
    "df_advan = (\n",
    "    pl.read_parquet(\n",
    "        path_output + 'advan_companies.parquet', columns=['city', 'province', 'company_name'])\n",
    "    .groupby(['city', 'province'])\n",
    "    .agg(pl.count().alias('company_count'))\n",
    "    .filter(pl.col('city').is_not_null())\n",
    "    .sort('company_count', descending=True)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-15T15:43:00.626797Z",
     "iopub.status.busy": "2023-09-15T15:43:00.626516Z",
     "iopub.status.idle": "2023-09-15T15:43:00.639300Z",
     "shell.execute_reply": "2023-09-15T15:43:00.638623Z"
    }
   },
   "outputs": [],
   "source": [
    "# create mapping for cities in both indeed and advan\n",
    "\n",
    "df_common_locations = df_indeed.join(df_advan, on=['city', 'province'], how='inner')\n",
    "common_locations = [\n",
    "    (city, province) for city, province \n",
    "    in zip(df_common_locations['city'], df_common_locations['province'])\n",
    "    ]\n",
    "mapping_cities = dict(zip(common_locations, common_locations))\n",
    "\n",
    "# add some stats\n",
    "\n",
    "perc_city_matches = df_common_locations['job_count'].sum() / df_indeed['job_count'].sum()\n",
    "print(f'number of cities in both indeed and advan: {len(df_common_locations)}')\n",
    "print(f'percentage perfect city matches: {perc_city_matches:.2%}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-15T15:43:00.685464Z",
     "iopub.status.busy": "2023-09-15T15:43:00.685012Z",
     "iopub.status.idle": "2023-09-15T15:43:00.715081Z",
     "shell.execute_reply": "2023-09-15T15:43:00.714344Z"
    }
   },
   "outputs": [],
   "source": [
    "# done by Anna on 26/07/2022\n",
    "\n",
    "mapping_other_cities = {\n",
    "    ('ottawa west', 'on') : [('ottawa', 'on'), ('kanata', 'on')],\n",
    "    ('ottawa east', 'on') : [('ottawa', 'on'), ('orleans', 'on')], \n",
    "    ('quebec city', 'qc'): ('quebec', 'qc'),\n",
    "    ('greater sudbury', 'on') : ('sudbury', 'on'), \n",
    "    ('greater sudbury', 'on') : ('sudbury', 'on'),\n",
    "    ('new sudbury','on') :  ('sudbury', 'on'), # There is also (sudbury, north, on) but not sure if that corresponds to sudbury as well\n",
    "    ('greater toronto area', 'on'): [\n",
    "        ('toronto', 'on'), ('whitby', 'on'), ('vaughan', 'on'),\n",
    "        ('uxbridge', 'on'), ('stouffville','on'), ('richmond hill', 'on'),\n",
    "        ('pickering', 'on'), ('oshawa', 'on'), ('oakville', 'on'),\n",
    "        ('newmarket', 'on'), ('milton', 'on'), ('markham', 'on'),\n",
    "        ('king', 'on'), ('glen williams', 'on'), ('east gwillimbury', 'on'),\n",
    "        ('caledon', 'on'), ('burlington', 'on'), ('brock', 'on'),\n",
    "        ('brampton', 'on'), ('aurora', 'on'), ('ajax', 'on'), ('acton', 'on'),\n",
    "        ('mississauga', 'on'), ('newcastle', 'on'), ('bowmanville', 'on'),\n",
    "        ('port perry', 'on'), ('keswick', 'on'), ('pefferlaw', 'on'),\n",
    "        ('sutton','on'),('udora','on') ## removed this one:,('willow beach')\n",
    "        ], # main cities of peel, durham, york and halton regions.\n",
    "    ('kitchener waterloo', 'on') : [('kitchener', 'on'), ('waterloo', 'on')],\n",
    "    ('saint bruno de montarville', 'qc') : [\n",
    "        ('saint bruno', 'qc'), # safegraph decided to keep it as saint bruno, but i checked couple of addresses and it is actually saint bruno de montarville\n",
    "        ('saint bruno de montarville', 'qc')\n",
    "        ],\n",
    "    ('chatham kent', 'on') : [\n",
    "        ('chatham', 'on'), ('wallaceburg','on'), ('tilbury', 'on'),\n",
    "        ('ridgetown', 'on'), ('wheatley', 'on'), ('dresden','on')\n",
    "        ], # chantam -kent is a single-tier municipality, I included its population cities                           \n",
    "    ('lower mainland', 'bc') : [\n",
    "        ('langley', 'bc'), ('abbotsford','bc'), ('surrey', 'bc'),\n",
    "        ('white rock', 'bc'), ('whistler','bc'), ('richmond', 'bc'),\n",
    "        ('pitt meadows', 'bc'), ('mission', 'bc'), ('hope','bc'),\n",
    "        ('chilliwack', 'bc'), ('belcarra', 'bc'), ('anmore', 'bc'),\n",
    "        ('new westminster', 'bc'), ('north vancouver','bc'), ('burnaby', 'bc'),\n",
    "        ('port coquitlam', 'bc'), ('coquitlam', 'bc'), ('port moody', 'bc'),\n",
    "        ('maple ridge','bc'),('delta', 'bc')\n",
    "        ], #seems like lower mainland cover vancouver regional district + some cities around it\n",
    "    ('niagara', 'on') : ('niagara falls', 'on'),\n",
    "    ('niagara falls centre', 'on') : ('niagara falls', 'on'),\n",
    "    ('niagara peninsula', 'on') : ('niagara falls', 'on'),\n",
    "    ('halton', 'on') : [\n",
    "        ('georgetown', 'on'), ('acton', 'on'), ('milton', 'on'),\n",
    "        ('oakville', 'on'), ('burlington', 'on')],\n",
    "        # might also contain halton hills, but not in safegraph\n",
    "    ('halton hills', 'on') : [\n",
    "        ('georgetown', 'on'), ('acton', 'on'), ('milton', 'on'),\n",
    "        ('oakville', 'on'), ('burlington', 'on')\n",
    "        ],\n",
    "    ('greater napanee', 'on') : ('napanee', 'on'),\n",
    "    ('rocky view', 'ab') : ('rocky view county', 'ab'),\n",
    "    ('metro vancouver regional district', 'bc' ) : [\n",
    "        ('white rock','bc'), ('west vancouver','bc'),  ('vancouver', 'bc'),\n",
    "        ('tsawwassen','bc'), ('surrey', 'bc'),  ('richmond', 'bc'),\n",
    "        ('port moody', 'bc'), ('port coquitlam', 'bc'), ('pitt meadows', 'bc'),\n",
    "        ('north vancouver', 'bc'), ('new westminster', 'bc'),\n",
    "        ('maple ridge', 'bc'), ('lions bay', 'bc'), ('langley', 'bc'),\n",
    "        ('delta', 'bc'), ('coquitlam', 'bc'), ('burnaby', 'bc'),\n",
    "        ('bowen island', 'bc'), ('belcarra', 'bc'), ('anmore', 'bc')\n",
    "        ],\n",
    "    ('metrotown', 'bc'): ('burnaby', 'bc'), # no such city as metrotown, it's a shopping center\n",
    "    ('hull', 'qc'): ('gatineau','qc'),\n",
    "    ('notre dame de grace', 'qc') : ('montreal', 'qc'),\n",
    "    ('beauport', 'qc') : ('quebec','qc'), # it is part of quebec city\n",
    "    ('peel', 'on') : [\n",
    "        ('mississauga', 'on'), ('brampton', 'on'), ('caledon', 'on')\n",
    "        ],\n",
    "    ('south surrey', 'bc') : ('surrey', 'bc'),\n",
    "    ('muskoka', 'on') : [\n",
    "        ('bala', 'on'), ('baysville', 'on'), ('bracebridge', 'on'),\n",
    "        ('dorset', 'on'), ('gravenhurst', 'on'), ('huntsville', 'on'),\n",
    "        ('minett', 'on'), ('port carling', 'on'), ('torrance', 'on'),\n",
    "        ('windermere', 'on')],\n",
    "    ('kawartha lakes', 'on') : [\n",
    "        ('cameron', 'on'), ('kirkfield', 'on'), ('lindsay', 'on'),\n",
    "        ('little britain', 'on'), ('oakwood', 'on')\n",
    "        ],\n",
    "    ('sainte foy', 'qc') : ('quebec', 'qc'),\n",
    "    ('durham region', 'on') : [\n",
    "        ('whitby', 'on'), ('uxbridge', 'on'), ('port perry', 'on'),\n",
    "        ('pickering', 'on'), ('oshawa', 'on'), ('newcastle', 'on'),\n",
    "        ('brock', 'on'), ('beaverton', 'on')\n",
    "        ],\n",
    "    ('lachenaie', 'qc') : ('terrebonne', 'qc'), # Used to be part of off-island suburb of Montreal, but now it is a city of Terrebonne\n",
    "    ('vancouver island', 'bc') : [\n",
    "        ('victoria', 'bc'), ('comox', 'bc'), ('ladysmith', 'bc'),\n",
    "        ('nanaimo', 'bc'), ('parksville', 'bc'), ('qualicum beach', 'bc'),\n",
    "        ('tofino', 'bc'), ('saanichton', 'bc'), ('north saanich', 'bc')\n",
    "        ],\n",
    "    ('charlesbourg', 'qc') : ('quebec', 'qc'), # Part of quebec city\n",
    "    ('whitchurch stouffville', 'on'): [('gormley','on'), ('stouffville', 'on')],\n",
    "    ('greater montreal area','qc') : [\n",
    "        ('terrebonne','qc'), ('saint jerome','qc'), ('repentigny','qc'),\n",
    "        ('montreal','qc'), ('mirabel','qc'), ('longueuil','qc'), ('laval','qc'),\n",
    "        ('dollard des ormeaux','qc'), ('brossard','qc')\n",
    "        ],\n",
    "    ('saint georges de beauce', 'qc') : ('saint georges','qc'),\n",
    "    ('georgina', 'on') : [\n",
    "        ('keswick', 'on'), ('pefferlaw', 'on'), ('sutton','on'), ('udora','on') \n",
    "        ## removed this one: ('willow beach')\n",
    "        ],\n",
    "    ('don mills', 'on'): [('north york', 'on'), ('toronto', 'on')],\n",
    "    ('saanich', 'bc'): [('victoria', 'bc'), ('saanichton', 'bc')], # saanich isnt in safegraph, used closest cities\n",
    "    ('barrhaven', 'on'): [('nepean','on'), ('ottawa', 'on')], \n",
    "    ('saint lin  laurentides', 'qc'): ('saint lin laurentides', 'qc'), # extra space in initial name was an issue\n",
    "    ('comox valley', 'bc') : ('comox', 'bc'),\n",
    "    ('wood buffalo', 'ab') : [\n",
    "        ('calling lake', 'ab'), ('red earth creek', 'ab'), ('wabasca', 'ab')\n",
    "        ], \n",
    "        \n",
    "    # newly added\n",
    "    ('rexdale', 'on') : [\n",
    "        ('toronto', 'on'), ('etobicoke', 'on')\n",
    "        ], #Rexdale is a neighbourhood of Toronto, located in Etobicoke \n",
    "    ('saint augustin', 'qc') : [\n",
    "        ('saint augustin de desmaures', 'qc'), ('saint augustin saguenay', 'qc'), ('saint augustin de desmaur', 'qc')\n",
    "        ], # added all location from Advan that contain augustin\n",
    "    ('saint henri', 'qc') : [\n",
    "        ('montreal', 'qc'), ('saint andre', 'qc'), ('saint henri de levis', 'qc')\n",
    "        ], #could be a municipality saint-henri nearby quebec city or a neighbourhood in montreal\n",
    "    ('lac saint jean', 'qc') : [\n",
    "        ('alma', 'qc'), ('dolbeau mistassini', 'qc'), ('saint felicien', 'qc'), ('roberval', 'qc'),\n",
    "        ('saguenay', 'qc'), ('saint bruno', 'qc'), ('saint bruno lac saint jean', 'qc'),\n",
    "        ('hebertville', 'qc'), ('hebertville station', 'qc')\n",
    "        ], #included main urban centers around lake saint jean (can come back to add smaller cities)\n",
    "    ('west island', 'qc') : [\n",
    "        ('montreal', 'qc'), ('dorval', 'qc'), ('pointe claire', 'qc'),\n",
    "        ('beaconsfield', 'qc'), ('kirkland', 'qc'), ('dollard des ormeaux', 'qc'), \n",
    "        ('dollard d ormeaux', 'qc'), ('baie durfe', 'qc'), ('sainte anne de bellevue', 'qc'), \n",
    "        ('senneville', 'qc'), ('pierrefonds', 'qc'), ('lile bizard', 'qc')\n",
    "        ], #The West Island (French: l'Ouest de l'île) is the unofficial name given to the city, towns and boroughs at the western end of the Island of Montreal, in Quebec, Canada\n",
    "    ('fleurimont', 'qc') : ('sherbrooke', 'qc'), \n",
    "    ('nunavik', 'qc') : ('kuujjuaq', 'qc'), #Nunavik is a vast territory located in the northernmost part of Quebec, kuujjuaq is its administrative capital\n",
    "    ('val belair', 'qc') : ('quebec city', 'qc'),\n",
    "    ('riviere des prairies', 'qc') : [\n",
    "        ('montreal', 'qc'), ('pointe aux trembles', 'qc')\n",
    "        ], #Rivière-des-Prairies–Pointe-aux-Trembles is a suburban borough on the eastern tip of the city of Montreal\n",
    "    ('town of mount royal', 'qc') : [\n",
    "        ('mont royal', 'qc'), ('ville mont royal', 'qc'), ('montreal', 'qc')\n",
    "        ],\n",
    "    ('doon', 'on') : ('kitchener', 'on'), #Doon is a suburban community and former village which is now a part of the city of Kitchener, Ontario\n",
    "    ('saint jean', 'qc') : ('saint jean sur richelieu', 'qc'),\n",
    "    ('sainte marie de beauce', 'qc') : ('sainte marie', 'qc'),\n",
    "    ('canal de la rive sud', 'qc') : ('montreal', 'qc'),\n",
    "    ('fraser valley regional district', 'bc') : ('chilliwack', 'bc'),\n",
    "    ('university of british columbia vancouver campus', 'bc') : ('vancouver', 'bc'),\n",
    "    ('toronto pearson international airport', 'on') : [('toronto', 'on'), ('mississauga', 'on')],\n",
    "    ('kananaskis village', 'ab') : ('kananaskis', 'ab'),\n",
    "    ('moncton area', 'nb') : ('moncton', 'nb'), \n",
    "    ('national capital region', 'on') : ('ottawa', 'on'),\n",
    "    ('sillery', 'qc') : ('quebec city', 'qc'), \n",
    "    ('toronto',\t'unknown') : ('toronto', 'on'),\n",
    "    ('saint romuald', 'qc') : [\n",
    "        ('saint romuald detchemin', 'qc'), ('saint romuald', 'qc')\n",
    "    ],\n",
    "    ('clarence rockland', 'on') : [\n",
    "        ('rockland', 'on'), ('prescott russell', 'on'), ('russell', 'on'), ('prescott', 'on')\n",
    "    ], \n",
    "    ('annacis',\t'bc') : [('delta', 'bc'), ('annacis island delta', 'bc')\n",
    "    ],\n",
    "    ('university of british columbia okanagan campus', 'bc') : ('kelowna', 'bc'),\n",
    "    (\"grand mere\", 'qc'): [(\"grand mere\", 'qc'), (\"grandmere\", 'qc')]\n",
    "}\n",
    "\n",
    "for source in mapping_other_cities:\n",
    "    mapping_cities[source] = mapping_other_cities[source]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-15T15:43:00.718482Z",
     "iopub.status.busy": "2023-09-15T15:43:00.718191Z",
     "iopub.status.idle": "2023-09-15T15:43:00.821243Z",
     "shell.execute_reply": "2023-09-15T15:43:00.820463Z"
    }
   },
   "outputs": [],
   "source": [
    "# check that source locations from mapping cities are all in Indeed (otherwise fuzzy_matching.py wont' work)\n",
    "\n",
    "df_check_indeed = (\n",
    "    pl.from_records(list(mapping_cities.keys()))\n",
    "    .transpose()\n",
    "    .rename({'column_0': 'city', 'column_1': 'province'})\n",
    "    .unique()\n",
    "    .join(df_indeed, on=['city', 'province'], how='anti')\n",
    ")\n",
    "\n",
    "assert len(df_check_indeed) == 0, 'Some source locations are not in Indeed (see df_check_indeed)'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-15T15:43:00.825072Z",
     "iopub.status.busy": "2023-09-15T15:43:00.824791Z",
     "iopub.status.idle": "2023-09-15T15:43:00.932681Z",
     "shell.execute_reply": "2023-09-15T15:43:00.931922Z"
    }
   },
   "outputs": [],
   "source": [
    "# check that target locations are all in Advan (otherwise fuzzy_matching.py wont' work)\n",
    "\n",
    "target_locations = []\n",
    "for locations in mapping_cities.values():\n",
    "    if isinstance(locations, list):\n",
    "        target_locations.extend(locations)\n",
    "    else:\n",
    "        target_locations.append(locations)\n",
    "\n",
    "\n",
    "df_check_advan = (\n",
    "    pl.from_records(target_locations)\n",
    "    .transpose()\n",
    "    .rename({'column_0': 'city', 'column_1': 'province'})\n",
    "    .unique()\n",
    "    .join(df_advan, on=['city', 'province'], how='anti')\n",
    ")\n",
    "\n",
    "assert len(df_check_advan) == 0, 'Some target locations are not in Advan (see df_check_advan)'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-15T15:43:00.936767Z",
     "iopub.status.busy": "2023-09-15T15:43:00.936435Z",
     "iopub.status.idle": "2023-09-15T15:43:01.050817Z",
     "shell.execute_reply": "2023-09-15T15:43:01.049872Z"
    }
   },
   "outputs": [],
   "source": [
    "# locations in Indeed that are not mapped to Advan\n",
    "\n",
    "df_mapped_locations = (\n",
    "    pl.from_records(list(mapping_cities.keys()))\n",
    "    .transpose()\n",
    "    .rename({'column_0': 'city', 'column_1': 'province'})\n",
    ")\n",
    "\n",
    "(\n",
    "    df_indeed\n",
    "    .join(df_mapped_locations, on=['city', 'province'], how='anti')\n",
    "    .filter(pl.col('job_count') >  1000)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-15T15:43:01.054463Z",
     "iopub.status.busy": "2023-09-15T15:43:01.054111Z",
     "iopub.status.idle": "2023-09-15T15:43:01.180126Z",
     "shell.execute_reply": "2023-09-15T15:43:01.179498Z"
    }
   },
   "outputs": [],
   "source": [
    "# locations in Advan that are not mapped:\n",
    "print('unused locations from advan:')\n",
    "(   \n",
    "    df_advan\n",
    "    .join(\n",
    "        pl.from_records(target_locations)\n",
    "        .transpose()\n",
    "        .rename({'column_0': 'city', 'column_1': 'province'}),\n",
    "        on=['city', 'province'],\n",
    "        how='anti'\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-15T15:43:01.183322Z",
     "iopub.status.busy": "2023-09-15T15:43:01.183061Z",
     "iopub.status.idle": "2023-09-15T15:43:01.190384Z",
     "shell.execute_reply": "2023-09-15T15:43:01.189300Z"
    }
   },
   "outputs": [],
   "source": [
    "# clean up dictionary to have values in list\n",
    "single_target = {\n",
    "    source: [target]  # put target in list\n",
    "    for source, target in mapping_cities.items()\n",
    "    if not isinstance(target, list)\n",
    "    }\n",
    "\n",
    "multiple_targets = {\n",
    "    source: target\n",
    "    for source, target in mapping_cities.items()\n",
    "    if isinstance(target, list)\n",
    "    } \n",
    "\n",
    "mapping_cities = single_target | multiple_targets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-15T15:43:01.195978Z",
     "iopub.status.busy": "2023-09-15T15:43:01.195557Z",
     "iopub.status.idle": "2023-09-15T15:43:01.208886Z",
     "shell.execute_reply": "2023-09-15T15:43:01.207795Z"
    }
   },
   "outputs": [],
   "source": [
    "# save dictionary\n",
    "with open(path_data + 'mapping_cities.pickle', 'wb') as handle:\n",
    "    pickle.dump(mapping_cities, handle)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.10.12 ('env_indeed2')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  },
  "vscode": {
   "interpreter": {
    "hash": "7120590dfa35e6512fb14e5e70b67446c3e78c7a5c027e908dbb14d6a3f8a0eb"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
